This kernel covers two parts, and both follow the same two-step process: NLP feature extraction followed by KNN model fitting. The first part analyzes the text of the questions, while the second part uses the text of the answers.
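As a rough, hedged outline of that two-step process (the toy strings and labels below are placeholders, not data from this kernel), TF-IDF vectorization feeds into a KNN model:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

# placeholder texts and labels, only to illustrate the two steps
sample_texts = ["How do I read a CSV file in pandas?",
                "Use pd.read_csv with the path to the file."]
sample_labels = [0, 1]

tfidf = TfidfVectorizer()                  # step 1: NLP (TF-IDF features)
X = tfidf.fit_transform(sample_texts)

knn = KNeighborsClassifier(n_neighbors=1)  # step 2: KNN model fitting
knn.fit(X, sample_labels)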
In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud,STOPWORDS
Questions=pd.read_csv('./Questions.csv',encoding = 'iso-8859-1')
Answers=pd.read_csv('./Answers.csv',encoding = 'iso-8859-1')
In [2]:
User_id_inQ= Questions['OwnerUserId'].unique()
User_id_inA= Answers['OwnerUserId'].unique()
In [3]:
All_id=set(User_id_inQ).intersection(User_id_inA)
In [4]:
print('So we have '+str(len(All_id))+ \
' users who post both questions and answers on Stack Overflow')
In [5]:
users=pd.DataFrame({'idUser':list(All_id)})
In [6]:
users.head()
Out[6]:
In [7]:
# total activity per user: number of questions plus number of answers posted
users['Quantity']=users['idUser'].apply(lambda x: \
len(Questions[Questions['OwnerUserId']==x]['Body']) \
+len(Answers[Answers['OwnerUserId']==x]['Body']))
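The apply above scans both DataFrames once per user, which gets slow with tens of thousands of users. A minimal alternative sketch, assuming the goal is only the per-user post counts (the names q_counts and a_counts are my own), precomputes the totals with value_counts:

# assumption: produces the same counts as above, computed once instead of per user
q_counts = Questions['OwnerUserId'].value_counts()
a_counts = Answers['OwnerUserId'].value_counts()
users['Quantity'] = users['idUser'].map(lambda x: q_counts.get(x, 0) + a_counts.get(x, 0))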
In [8]:
users.head()
Out[8]:
In [9]:
users_final=users.sort_values(['Quantity'],ascending=False).reset_index(drop=True)
users_final.head()
Out[9]:
In [10]:
users_final=users_final.iloc[0:10000,]
users_final.shape
Out[10]:
In [11]:
All_id=list(users_final['idUser'])
First, clean the body text of the questions and answers: strip out the code blocks and HTML tags so that only the plain text of each post remains. Only the main body text of each post will be used.
In [12]:
# remove the code blocks from the question bodies
body = Questions['Body'].str.replace(r'<code>[^<]+</code>',' ',regex=True)
# strip the remaining HTML tags and line breaks, keeping only the plain text
Questions['QuestionBody'] = body.str.replace(r"<[^>]+>|\n|\r", " ",regex=True)
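To see concretely what these two substitutions do, here is a small standalone example on a made-up snippet (the HTML string below is an assumption, not a row from the dataset):

import re

sample = "<p>How do I loop over a list?</p>\n<code>for x in items: print(x)</code>"
no_code = re.sub(r'<code>[^<]+</code>', ' ', sample)  # drop the code block
plain = re.sub(r'<[^>]+>|\n|\r', ' ', no_code)        # drop tags and line breaks
print(plain)  # only the plain question text remains, padded with spaces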
In [13]:
Questions.head()
Out[13]:
In [15]:
# remove the code blocks from the answer bodies
body = Answers['Body'].str.replace(r'<code>[^<]+</code>',' ',regex=True)
# strip the remaining HTML tags and line breaks; the column keeps the name
# QuestionBody so the same downstream code works for questions and answers
Answers['QuestionBody'] = body.str.replace(r"<[^>]+>|\n|\r", " ",regex=True)
In [16]:
Answers.head()
Out[16]:
In [17]:
# keep OwnerUserId and the cleaned text, restricted to the top 10,000 users
Q_data=Questions[['OwnerUserId','QuestionBody']]
A_data=Answers[['OwnerUserId','QuestionBody']]
Question=Q_data[Q_data['OwnerUserId'].isin(All_id)]
Answer=A_data[A_data['OwnerUserId'].isin(All_id)]
In [25]:
Question.head()
Out[25]:
In [26]:
Answer.head()
Out[26]:
In [29]:
Answer.shape
Out[29]:
In [32]:
Answer['QuestionBody'][6]
Out[32]:
In [35]:
type(Question.QuestionBody)
Out[35]:
In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
Q_features=tfidf.fit_transform(Question.QuestionBody)
A_features=tfidf.fit_transform(Answer.QuestionBody)
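Note that calling fit_transform twice with the same TfidfVectorizer refits the vocabulary on the answers, so Q_features and A_features end up in different feature spaces. If a later step needs question and answer vectors with matching columns, one option (a sketch under that assumption, not necessarily what this kernel requires) is to fit the vocabulary once on the combined text and only transform each corpus:

# assumption: a shared vocabulary is wanted so the two matrices are comparable
shared_tfidf = TfidfVectorizer()
shared_tfidf.fit(pd.concat([Question.QuestionBody, Answer.QuestionBody]))
Q_shared = shared_tfidf.transform(Question.QuestionBody)
A_shared = shared_tfidf.transform(Answer.QuestionBody)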
In [36]:
type(Q_features)
Out[36]:
In [37]:
Q_features
Out[37]:
In [38]:
A_features
Out[38]:
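As a hedged preview of the KNN step announced at the top of the kernel (the exact model and target used later may differ; this sketch only shows that the sparse TF-IDF matrix can be fed to a nearest-neighbour model directly):

from sklearn.neighbors import NearestNeighbors

# unsupervised nearest-neighbour index over the question vectors
nn = NearestNeighbors(n_neighbors=5, metric='cosine')
nn.fit(Q_features)
# distances and indices of the 5 questions most similar to the first one
dist, idx = nn.kneighbors(Q_features[0])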